{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 🔴 Understanding `Tokenization` in NLP!"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "1️⃣ Character Tokenization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['W', 'e', ' ', 'l', 'o', 'v', 'e', ' ', 'N', 'L', 'P', '!']\n"
     ]
    }
   ],
   "source": [
    "raw_text = \"We love NLP!\"\n",
    "tokens = list(raw_text)\n",
    "print(tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{' ': 0, '!': 1, 'L': 2, 'N': 3, 'P': 4, 'W': 5, 'e': 6, 'l': 7, 'o': 8, 'v': 9}\n"
     ]
    }
   ],
   "source": [
    "# Numerical encoding of individual character\n",
    "token2idx = {char: idx for idx, char in enumerate(sorted(set(tokens)))}\n",
    "print(token2idx)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[5, 6, 0, 7, 8, 9, 6, 0, 3, 2, 4, 1]\n"
     ]
    }
   ],
   "source": [
    "# Using token2idx to map our tokenized text to integers\n",
    "integer_tokens = [token2idx[token] for token in tokens]\n",
    "print(integer_tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.Size([12, 10])"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# One-hot encoding the numbers\n",
    "import torch\n",
    "import torch.nn.functional as F\n",
    "\n",
    "integer_tokens = torch.tensor(integer_tokens)\n",
    "one_hot_encode_tokens = F.one_hot(integer_tokens, num_classes=len(token2idx))\n",
    "one_hot_encode_tokens.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Token = W\n",
      "Integer Encoded Token = 5\n",
      "One hot encoded Token = tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0])\n"
     ]
    }
   ],
   "source": [
    "print(f\"Token = {tokens[0]}\")\n",
    "print(f\"Integer Encoded Token = {integer_tokens[0]}\")\n",
    "print(f\"One hot encoded Token = {one_hot_encode_tokens[0]}\")"
   ]
  },
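  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The mapping is lossless, so it can be inverted. A minimal sketch (plain Python, nothing beyond what we built above): construct the reverse `idx2token` dictionary and decode the integer sequence back into the original string."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reverse mapping: integer index -> character\n",
    "idx2token = {idx: char for char, idx in token2idx.items()}\n",
    "\n",
    "# Decoding the integer sequence recovers the original text exactly\n",
    "decoded_text = \"\".join(idx2token[idx] for idx in integer_tokens.tolist())\n",
    "print(decoded_text)"
   ]
  },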
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "2️⃣ Word tokenization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['We', 'love', 'NLP!']\n"
     ]
    }
   ],
   "source": [
    "# Splitting raw text based on whitespaces\n",
    "word_tokens = raw_text.split()\n",
    "print(word_tokens)"
   ]
  },
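  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Note that naive whitespace splitting keeps punctuation attached (`NLP!` is a single token), and every distinct word form needs its own vocabulary entry. A small sketch of a word-level vocabulary, built the same way as `token2idx` above:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Word-level vocabulary, analogous to token2idx for characters\n",
    "word2idx = {word: idx for idx, word in enumerate(sorted(set(word_tokens)))}\n",
    "print(word2idx)\n",
    "\n",
    "# Punctuation stays glued to words (\"NLP!\" != \"NLP\"), and any word not\n",
    "# seen during vocabulary building would be out-of-vocabulary later\n",
    "print([word2idx[word] for word in word_tokens])"
   ]
  },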
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "3️⃣ Subword Tokenization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "model_ckpt = 'distilbert-base-uncased'\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_ckpt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import DistilBertTokenizer\n",
    "distilbert_tokenizer = DistilBertTokenizer.from_pretrained(model_ckpt)"
   ]
  },
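  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`AutoTokenizer` resolves the checkpoint name to the matching tokenizer class for us (typically the fast, Rust-backed variant), while `DistilBertTokenizer` loads the same tokenizer explicitly. A quick sanity check that both produce identical encodings:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Both tokenizers should agree on the input IDs for the same text\n",
    "print(tokenizer(raw_text).input_ids == distilbert_tokenizer(raw_text).input_ids)"
   ]
  },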
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'input_ids': [101, 2057, 2293, 17953, 2361, 999, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}\n"
     ]
    }
   ],
   "source": [
    "# Lets see the tokenizer in action now \n",
    "encoded_text = tokenizer(raw_text)\n",
    "print(encoded_text)"
   ]
  },
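  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Two attributes worth knowing when working with a pretrained tokenizer: the size of its subword vocabulary, and the maximum sequence length the model accepts."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Vocabulary size and maximum sequence length (512 for DistilBERT)\n",
    "print(tokenizer.vocab_size)\n",
    "print(tokenizer.model_max_length)"
   ]
  },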
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['[CLS]', 'we', 'love', 'nl', '##p', '!', '[SEP]']\n"
     ]
    }
   ],
   "source": [
    "tokens = tokenizer.convert_ids_to_tokens(encoded_text.input_ids)\n",
    "print(tokens)"
   ]
  },
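  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The `##` prefix marks a continuation of the previous token, and the uncased checkpoint lowercases everything. `decode` goes all the way back from IDs to a human-readable string:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Decode the IDs back to text; the added special tokens remain visible\n",
    "print(tokenizer.decode(encoded_text.input_ids))"
   ]
  },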
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "3️⃣ Tokenizing entire Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !pip install datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Found cached dataset emotion (/Users/pachaar/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ee4202f445ef41b7bd91cc51ccfe27dc",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/3 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['text', 'label'],\n",
       "        num_rows: 16000\n",
       "    })\n",
       "    validation: Dataset({\n",
       "        features: ['text', 'label'],\n",
       "        num_rows: 2000\n",
       "    })\n",
       "    test: Dataset({\n",
       "        features: ['text', 'label'],\n",
       "        num_rows: 2000\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 66,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# we will load the tweet emotions dataset\n",
    "tweet_emotions = load_dataset(\"emotion\")\n",
    "tweet_emotions"
   ]
  },
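  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Each split stores the raw tweet text together with an integer emotion label:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Peek at one training example\n",
    "print(tweet_emotions[\"train\"][0])"
   ]
  },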
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Let's define a function for tokenization\n",
    "def tokenize(batch):\n",
    "    return tokenizer(batch[\"text\"], padding=True, truncation=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1045, 2064, 2175, 2013, 3110, 2061, 20625, 2000, 2061, 9636, 17772, 2074, 2013, 2108, 2105, 2619, 2040, 14977, 1998, 2003, 8300, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}\n"
     ]
    }
   ],
   "source": [
    "print(tokenize(tweet_emotions[\"train\"][:2]))"
   ]
  },
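  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Padding makes both sequences the same length: the shorter tweet is filled with the padding ID, and its `attention_mask` is 0 at those positions so the model ignores them. We can confirm which token is used for padding:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The padding token and its ID (ID 0 maps to [PAD] for this checkpoint)\n",
    "print(tokenizer.pad_token, tokenizer.pad_token_id)"
   ]
  },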
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0e3e83bd4e2f4328bbda2dcba85d46af",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "01780754e110423da5df6e8294b5ea39",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "918f15064a20417d92d7c42d389e2646",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['text', 'label', 'input_ids', 'attention_mask']\n"
     ]
    }
   ],
   "source": [
    "# Applying tokenization across entire data set\n",
    "tweet_emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)\n",
    "\n",
    "print(tweet_emotions_encoded['test'].column_names)"
   ]
  },
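  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "With `batched=True` and `batch_size=None`, `map` passes each split to `tokenize` as one big batch, so all rows within a split are padded to the same length. To feed the result to PyTorch, the new columns can be converted to tensors; a sketch using the `set_format` API from `datasets`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the tokenized columns to PyTorch tensors for model training\n",
    "tweet_emotions_encoded.set_format(\n",
    "    \"torch\", columns=[\"input_ids\", \"attention_mask\", \"label\"]\n",
    ")\n",
    "print(tweet_emotions_encoded[\"train\"][0][\"input_ids\"].shape)"
   ]
  }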
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "env_twitter",
   "language": "python",
   "name": "env_twitter"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
